Loading Necessary Packages

library(data.table)
library(reshape2)
library(dplyr)
library(tidyverse)
library(ComplexHeatmap)
library(ggplot2)
library(circlize)
library(RColorBrewer)
library(viridis)
library(colorspace)

Loading necessary code (CHANGE PATHS)

#courtesy from @Rich Scriven https://stackoverflow.com/questions/26045478/source-r-scripts-in-different-folders
sourceFolder <- function(folder, recursive = FALSE, ...) 
{ 
    files <- list.files(folder, pattern = "[.][rR]$", 
                        full.names = TRUE, recursive = recursive)
    if (!length(files))
        stop(simpleError(sprintf('No R files in folder "%s"', folder)))
    src <- invisible(lapply(files, source, ...))
    message(sprintf('%s files sourced from folder "%s"', length(src), folder))
}

sourceFolder(
  "/mnt/sda/alberto/projects/dev/comparABle/code/functions/", # change for definitive path once its final
  recursive = TRUE
  )
# source("/mnt/sda/alberto/projects/dev/minitools_R/code/functions/")

source('./code/R_functions/TF_analysis_functions/barplotcluster.R') # change for definitive path once its final

Data Load

We start with our previous session.

load(
  "./outputs/rda/01_plei_dataload.rda"
)

We will also load the TF annotation of Pristina leidyi.

plei_tfs <- read.table(
  "./data/03_TFdata/20221025_pristina_TFs_curated.tsv",
  header = T)[,1:2] #load column of plei ID and TF class
head(plei_tfs)
##                  id class
## 1 PrileiEVm006188t1  A_20
## 2 PrileiEVm016861t1  A_20
## 3 PrileiEVm018456t1  A_20
## 4 PrileiEVm018745t1  A_20
## 5 PrileiEVm007500t1  AP_2
## 6 PrileiEVm000374t1  ARID

We subset the gene expression pseudo-bulk matrix to retrieve expression from the TFs.

plei_tfs_cpm <-
  plei_cpm[
    rownames(plei_cpm) %in% plei_tfs$id,
    1:49
    ]

colnames(plei_tfs_cpm) <- colnames(plei_cpm)[1:49]

plei_tfs_cpm <- as.data.frame(plei_tfs_cpm)

plei_tfs_cpm_hvg <- 
  plei_tfs_cpm[
    rownames(plei_tfs_cpm) %in% plei_hvg$id,
    ]

We can browse the expression level of different TFs using this function.

plot_tf_plei <- function(x){
  if(x %in% rownames(plei_tfs_cpm)) {
    barplot(
      height=unlist(c(
        plei_tfs_cpm[
          grep(
            paste("^",x,"$",sep=""),
            rownames(plei_tfs_cpm),
          ),
          1:49
        ]
      )),
      col = plei_ctypes_col$col,
      border = "#2F2F2F",
      las=2,
      cex.names=0.7,
      main= paste(
        x,
        " (",
        plei_tfs[grep(x,plei_tfs$id),2],
        ")\n",
        sep=""
      ),
      ylab="counts per million per cluster"
    )} else {
      stop("Name not in list of TFs.")
    }
}

For example, the hnf4 gene;

hnf4 <- "PrileiEVm006891t1"
plot_tf_plei(hnf4)

# -gut,hnf4 : 006891, 004665, 008019
# -muscle: myoD: 008071, 014648
# -neurons: myt1l: 000431, pou6f :003917
# -UMAP feature of novel cell types:
#   -lipoxygenase+ 001926
# -globin+ 002870 005469
# -eleocytes 005230 – ets4
# -vigilin+ 005896, 006907
# -polycystin 010161

As we have expression data of many transcription factors, we can visualise the global patterns of expression using heatmaps. We will do so by scaling the log-transformed expression of TFs to obtain a z-score.

plei_tfs_zsco <- 
  t(
    scale(
      t(
        log(plei_tfs_cpm[,1:49]+1) # we add 1 for visualisation only
        )
      )
    )

And using the ComplexHeatmap package:

Analysing TFs at the TF class level

We can explore the variability of TFs based at a TF class level.

# boxplot CV per TF class

plei_tfs_class_cv <- merge(
  data.frame(
    id = plei_tfs$id,
    class = factor(
      plei_tfs$class,
      levels = unique(plei_tfs$class)
      )
    ),
  data.frame(
    id = rownames(plei_tfs_cpm),
    cv = apply(
      plei_tfs_cpm[,1:49],
      1,
      function(x){
        sd(x)/mean(x)
      }
    )
  ),
  by = 1
)[,2:3]

head(plei_tfs_class_cv)
##     class        cv
## 1 zf_C2H2 0.5658257
## 2 zf_C2H2 1.5710520
## 3     DDT 0.7542196
## 4     MYB 0.7197607
## 5    bHLH 0.4712681
## 6    bHLH 1.3131212
## Warning: Vectorized input to `element_text()` is not officially supported.
## Results may be unexpected or may change in future versions of ggplot2.
## Don't know how to automatically pick scale for object of type table. Defaulting to continuous.

With this we observe that there are a number of TF classes with very few assigned genes in Pristina.

We can focus on those with a minimum of four or five genes.

plei_tfs_mainclasses <- 
  names(
    table(plei_tfs$class)[
      table(plei_tfs$class) >= 4
      ]
    )
plei_tfs_mainclasses <- sort(plei_tfs_mainclasses) # sort alphabetically
plei_tfs_mainclasses
##  [1] "A_20"        "bHLH"        "bZIP"        "CP2"         "CSD"        
##  [6] "CUT"         "DM"          "E2F"         "ETS"         "Forkhead"   
## [11] "HMG"         "Homeodomain" "HTH"         "IRF"         "MH1"        
## [16] "MYB"         "P53"         "PAX"         "Pou"         "RFX"        
## [21] "RHD"         "Runt"        "RXR_like"    "SAND"        "SRF"        
## [26] "T_box"       "TFII"        "THAP"        "THR_like"    "Tub"        
## [31] "ZBTB"        "zf_BED"      "zf_C2H2"     "zf_C4"       "zf_GATA"

ANOVA analysis of transcription factors

To find differences in TF class composition between cell clusters, we can resort to multivariate analysis two-way ANOVA to detect differences of TF expression between cell clusters, TF class, and the interaction of the two.

We will aggregate expression counts at the broad cell type level for this analysis.

broadtypes_sort <- list( # using here the one from the excel
  piwi = c(1,2,3),
  epidermis = c(4,5,6,7,8,9,10,11),
  gut = c(12,13,14,15,16,17,18,19,20),
  muscle = c(21,22,23,24),
  neuron = c(25,26,27,28,29,30,31),
  blood = c(32,33),
  polycystin = c(34,35),
  eleo = c(36,37,38),
  chaetal = c(39,40),
  lipox = 41,
  vigil = 42,
  lumbrok = 43,
  carb = 44,
  secretory = c(45,46),
  arg = 47,
  ldlrr = 48,
  metaneph = 49
)
plei_cpm_broad <- sapply(
  X = broadtypes_sort,
  function(x) {if(length(x) > 1){rowSums(plei_cpm[,x])} else {plei_cpm[,x]}} #maybe should be done with counts_norm?
)

We generate our dataset for ANOVA analysis

anova_data <- merge(
  plei_cpm_broad,
  plei_tfs,
  by.x = 0, by.y = 1)[,c(19,2:18)]

anova_data <- anova_data[anova_data$class %in% plei_tfs_mainclasses,]

anova_data <- reshape2::melt(anova_data)
## Using class as id variables
colnames(anova_data) <- c("class","ctype","counts")
anova_data$class <- factor(anova_data$class, levels = sort(unique(anova_data$class)))
anova_data$ctype <- 
  factor(
    anova_data$ctype,
    levels = colnames(plei_cpm_broad)
    )

anova_groupby <- group_by(anova_data, class, ctype) %>%
  summarise(
    count = n(),
    mean = mean(counts, na.rm = TRUE),
    sd = sd(counts, na.rm = TRUE)
  )
## `summarise()` has grouped output by 'class'. You can override using the
## `.groups` argument.

A quick check of the looks of our dataset:

summary(anova_data)
##          class            ctype          counts         
##  zf_C2H2    :2754   piwi     : 615   Min.   :    0.000  
##  Homeodomain:1224   epidermis: 615   1st Qu.:    2.504  
##  bHLH       :1105   gut      : 615   Median :   23.605  
##  ZBTB       :1003   muscle   : 615   Mean   :   88.771  
##  zf_C4      : 612   neuron   : 615   3rd Qu.:   74.990  
##  Forkhead   : 442   blood    : 615   Max.   :10846.604  
##  (Other)    :3315   (Other)  :6765
summary(anova_groupby)
##      class           ctype         count             mean        
##  A_20   : 17   piwi     : 35   Min.   :  2.00   Min.   :   0.00  
##  bHLH   : 17   epidermis: 35   1st Qu.:  4.00   1st Qu.:  18.03  
##  bZIP   : 17   gut      : 35   Median :  6.00   Median :  43.22  
##  CP2    : 17   muscle   : 35   Mean   : 17.57   Mean   : 101.78  
##  CSD    : 17   neuron   : 35   3rd Qu.: 16.00   3rd Qu.: 110.30  
##  CUT    : 17   blood    : 35   Max.   :162.00   Max.   :1509.40  
##  (Other):493   (Other)  :385                                     
##        sd         
##  Min.   :   0.00  
##  1st Qu.:  24.09  
##  Median :  48.76  
##  Mean   : 130.81  
##  3rd Qu.: 124.32  
##  Max.   :2623.02  
## 

The anova analysis here

# Two-way ANOVA proper
res.aov2 <- aov(counts ~ ctype + class + ctype:class, data = anova_data)
summary(res.aov2)
##               Df    Sum Sq Mean Sq F value              Pr(>F)    
## ctype         16  71570079 4473130  63.123 <0.0000000000000002 ***
## class         34  44051561 1295634  18.284 <0.0000000000000002 ***
## ctype:class  544  67564552  124200   1.753 <0.0000000000000002 ***
## Residuals   9860 698712989   70863                                
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

As we see there is significant interactions between TF class and cell cluster.

And an two-way interaction plot of the means of counts by class AND cell cluster.

library(effects)
## Loading required package: carData
## Use the command
##     lattice::trellis.par.set(effectsTheme())
##   to customize lattice options for effects plots.
## See ?effectsTheme for details.
interaction.plot(
  x.factor = anova_data$ctype, trace.factor = anova_data$class, 
  response = anova_data$counts, fun = mean, 
  type = "b", legend = TRUE,
  pch=c(1,19),
  col = rainbow(35)
  )

plot(
  allEffects(res.aov2),
  multiline=TRUE
  )

Tukey test to compare the means of TF counts by class, by cell cluster

# Tukey comparison of means
anova_comparisons_interaction <- TukeyHSD(res.aov2, which = "ctype:class")
anova_comparisons_interaction$`ctype:class` <- 
  anova_comparisons_interaction$`ctype:class`[
    anova_comparisons_interaction$`ctype:class`[,4] < 0.05 ,
    ]

From the interaction test, we extract the significant results for significant differences between the same TF class across different cell clusters.

# put this all together with dplyr ...
# also think this better and implement a FUNCTION
anova_comparisons_interaction_DF <- data.frame(
  comparison = rownames(anova_comparisons_interaction$`ctype:class`),
  as.data.frame(
  anova_comparisons_interaction$`ctype:class`
  )
)
anova_comparisons_interaction_DF <- data.frame(
  anova_comparisons_interaction_DF,
  A = sub("-..*", "", anova_comparisons_interaction_DF$comparison),
  B = sub("..*-", "", anova_comparisons_interaction_DF$comparison)
)

anova_comparisons_interaction_DF$class_A <- 
  sub(".*:","",anova_comparisons_interaction_DF$A)
anova_comparisons_interaction_DF$class_B <- 
  sub(".*:","",anova_comparisons_interaction_DF$B)

anova_comparisons_interaction_DF <- 
  anova_comparisons_interaction_DF[
  anova_comparisons_interaction_DF$class_A == 
    anova_comparisons_interaction_DF$class_B ,
  ]

table(anova_comparisons_interaction_DF$class_A) # or class_B, its the same anyway
## 
##        bHLH        bZIP         CSD         ETS         HMG Homeodomain 
##          10          39          46          29          14          12 
##         MYB     zf_C2H2       zf_C4 
##           5          17           1
plei_tfs_signif_class <- names(table(anova_comparisons_interaction_DF$class_A))

A quick visualisation of what are the TF classes that explain the most differences between cell clusters:

barplot(
  sort(table(anova_comparisons_interaction_DF$class_A)), # or class_B, its the same anyway
  col = rev(brewer.pal(9,"Spectral")),
  las = 2
  )

And we see that all these classes are already in the set of TF classes we will be exploring

plei_tfs_signif_class %in% plei_tfs_mainclasses
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE

Visualising TF class prevalence across cell clusters

To visualise these differences in TF abundance at a TF class level, we retrieve the expression matrix at the class level. Below is a quick look too at the table (last columns, including class)

plei_tfs_cpm_topclass <- 
  merge(
    plei_tfs_cpm,
    plei_tfs,
    by.x = 0,
    by.y = 1,
  ) %>% 
  column_to_rownames("Row.names") %>%
  filter(class %in% plei_tfs_mainclasses)

head(plei_tfs_cpm_topclass[,45:50])
##                   34_secretory_1 44_secretory_2 32_arginase_pos_cells
## PrileiEVm000032t1       13.97038       39.97282               96.8828
## PrileiEVm000093t1        0.00000        0.00000                0.0000
## PrileiEVm000118t1       83.82230       39.97282               24.2207
## PrileiEVm000154t1       83.82230       79.94564              121.1035
## PrileiEVm000159t1       13.97038        0.00000               72.6621
## PrileiEVm000175t1        0.00000       79.94564               96.8828
##                   35_ldlrr_pos_cells 37_metanephridia   class
## PrileiEVm000032t1           33.62927         82.42664 zf_C2H2
## PrileiEVm000093t1           33.62927          0.00000 zf_C2H2
## PrileiEVm000118t1          168.14635        164.85328     MYB
## PrileiEVm000154t1          168.14635        274.75547    bHLH
## PrileiEVm000159t1           33.62927          0.00000    bHLH
## PrileiEVm000175t1           67.25854          0.00000     CSD

We will generate a matrix to count how many genes of each TF class are expressed in each cell cluster.

plei_tfs_ngenes <- 
  apply(
    plei_tfs_cpm_topclass[1:49],
    2,
    function(x){ ifelse(x > 0, 1, 0) }
    )
row.names(plei_tfs_ngenes) <- plei_tfs_cpm_topclass$class

plei_tfs_ngenes <-
  aggregate(
    plei_tfs_ngenes[, 1:49],
    by = list(Category = rownames(plei_tfs_ngenes)),
    FUN = sum) %>%
  arrange(Category) %>%
  column_to_rownames("Category")

We also generate a table of all the cpms per class and per cell cluster. We will aggregate (sum) all cpms by TF class, for each cell cluster separately. (We will also generate a normalised )

plei_tfs_expgenes <-
  aggregate(
    plei_tfs_cpm_topclass[, 1:49],
    by = list(Category = plei_tfs_cpm_topclass$class),
    FUN = sum) %>%
  arrange(Category) %>%
  column_to_rownames("Category")

We divide the cpms/class by the numexp/class, thus retrieving the cpms per class normalised by the number of expressed genes from a given class.

plei_tfs_expngenes <- 
  plei_tfs_expgenes / plei_tfs_ngenes
plei_tfs_expngenes <- apply(plei_tfs_expngenes, 2, function(x){ifelse(is.nan(x),0,x)})

plei_tf_EXPNGEN <- 
  apply(plei_tfs_expngenes, 2, function(x) {
    x / sum(x)
  })

A barplot to show the prominence/prevalence of TF classes on each cell cluster.

Saving the data

We will save the important bits for further analysis in the rest of markdowns.

save(
  # cell type table(s)
  plei_counts,
  plei_ctypes,
  plei_ctypes_col,
  # gene expression data
  plei_cpm,
  plei_tfs_cpm,
  plei_tfs_cpm_topclass,
  # tf data
  plei_tfs,
  plei_tfs_mainclasses,
  plei_tfs_signif_class,
  # anova analysis
  res.aov2,
  anova_comparisons_interaction,
  anova_comparisons_interaction_DF,
  # visual annotations
  ctypes_rowAnno,
  clu_ha,
  #pick color palette for TFs
  # destination
  file = paste0(
    "./outputs/rda/",
    "02_tf_analysis.rda"
  )
)